The dataset contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions.
The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.
It contains only numerical input variables which are the result of a PCA transformation.
Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data.
Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'.
Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount; this feature can be used for example-dependent cost-sensitive learning.
Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
The term Boosting refers to a family of algorithms which convert weak learners into strong learners.
There are many boosting algorithms:
sklearn.ensemble.GradientBoostingRegressor
xgboost.XGBRegressor # fast and best
lightgbm.LGBMRegressor # extremely fast, slightly lower accuracy than xgboost
catboost.CatBoostRegressor # good for categorical feats
import sys
ENV_BHISHAN = None
try:
import bhishan
print('Environment: Personal environment')
ENV_BHISHAN = True
%load_ext autoreload
%autoreload 2
except:
print('Module "bhishan" not found.')
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
#!pip install hpsklearn
!pip install shap eli5
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
# set OMP_NUM_THREADS=1 for hpsklearn package
#!export OMP_NUM_THREADS=1
print('Environment: Google Colab')
import time
notebook_start_time = time.time()  # used to report total notebook runtime

# personal module
import bhishan
from bhishan import bp

import numpy as np
import pandas as pd

# global random seed, reused for every split and model below
SEED = 100

# visualization
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)

# six and pickle
import six
import pickle
import joblib

# mixed
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
# sklearn
import sklearn

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# sklearn scalar metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# boosting
import xgboost, lightgbm, catboost
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier, DMatrix
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier, Pool, CatBoost

# parameters tuning
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample

# model interpretation modules
import eli5
import shap
import yellowbrick
import lime
import scikitplot

# version: record library versions for reproducibility
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Bhishan Poudel 2020-09-29 CPython 3.7.7 IPython 7.18.1 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit eli5 0.10.1 seaborn 0.11.0 json 2.0.9 xgboost 1.1.1 bhishan 0.3.1 joblib 0.16.0 sklearn 0.23.1 numpy 1.18.4 lightgbm 2.3.1 yellowbrick 1.1 optuna 2.0.0 pandas 1.1.0 catboost 0.23.2 six 1.15.0 scikitplot 0.3.7 shap 0.35.0
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Display the public (non-underscore) attributes of *obj* as a dataframe.

    Example:
    ========
    show_method_attributes(list)
    """
    hidden = 'os np pd sys time psycopg2'.split()

    # keep non-dunder/non-private names, then drop common module aliases
    names = [name for name in dir(obj) if name[0] != '_']
    names = [name for name in names if name not in hidden]

    # optional prefix filter: single string or a collection of prefixes
    if isinstance(start, str):
        names = [name for name in names if name.startswith(start)]
    if isinstance(start, (tuple, list)):
        kept = []
        for name in names:
            for prefix in start:
                if name.startswith(prefix):
                    kept.append(name)
        names = kept

    # optional substring filter: single string or a collection of fragments
    if isinstance(inside, str):
        names = [name for name in names if inside in name]
    if isinstance(inside, (tuple, list)):
        kept = []
        for name in names:
            for fragment in inside:
                if fragment in name:
                    kept.append(name)
        names = kept

    # lay the names out in ncols columns, padding ragged ends with ''
    chunks = np.array_split(names, ncols)
    return pd.DataFrame(chunks).T.fillna('')
# Running results table: one row per model/configuration, appended below.
df_eval = pd.DataFrame({'Model': [],
                        'Description':[],
                        'Accuracy':[],
                        'Precision':[],
                        'Recall':[],
                        'F1':[],
                        'AUC':[],
                        })

# Load the raw credit-card fraud data (zipped csv) straight from GitHub.
ifile = 'https://github.com/bhishanpdl/Project_Fraud_Detection/blob/master/data/raw/creditcard.csv.zip?raw=true'
df = pd.read_csv(ifile,compression='zip')
print(df.shape)
df.head()
(284807, 31)
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
# 'Class' is the binary target: 1 = fraud, 0 = normal transaction.
target = 'Class'
features = df.columns.drop(target)  # every other column is a predictor
df[target].value_counts(normalize=True)*100  # class balance in percent
0 99.827251 1 0.172749 Name: Class, dtype: float64
from sklearn.model_selection import train_test_split

# 80/20 train/test split, stratified so the ~0.17% fraud rate is
# preserved in both parts.
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target,axis=1),
    df[target],
    test_size=0.2,
    random_state=SEED,
    stratify=df[target])

# plain numpy copies of the labels for metric functions
ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()

print(df_Xtrain_orig.shape)
df_Xtrain_orig.head()
(227845, 30)
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 211885 | 138616.0 | -1.137612 | 2.345154 | -1.767247 | 0.833982 | 0.973168 | -0.073571 | 0.802433 | 0.733137 | -1.154087 | ... | 0.062820 | 0.114953 | 0.430613 | -0.240819 | 0.124011 | 0.187187 | -0.402251 | 0.196277 | 0.190732 | 39.46 |
| 12542 | 21953.0 | -1.028649 | 1.141569 | 2.492561 | -0.242233 | 0.452842 | -0.384273 | 1.256026 | -0.816401 | 1.964560 | ... | 0.350032 | -0.380356 | -0.037432 | -0.503934 | 0.407129 | 0.604252 | 0.233015 | -0.433132 | -0.491892 | 7.19 |
| 270932 | 164333.0 | -1.121864 | -0.195099 | 1.282634 | -3.172847 | -0.761969 | -0.287013 | -0.586367 | 0.496182 | -2.352349 | ... | -0.113632 | -0.328953 | -0.856937 | -0.056198 | 0.401905 | 0.406813 | -0.440140 | 0.152356 | 0.030128 | 40.00 |
| 30330 | 35874.0 | 1.094238 | -0.760568 | -0.392822 | -0.611720 | -0.722850 | -0.851978 | -0.185505 | -0.095131 | -1.122304 | ... | 0.354148 | -0.227392 | -1.254285 | 0.022116 | -0.141531 | 0.114515 | -0.652427 | -0.037897 | 0.051254 | 165.85 |
| 272477 | 165107.0 | 2.278095 | -1.298924 | -1.884035 | -1.530435 | -0.649500 | -0.996024 | -0.466776 | -0.438025 | -1.612665 | ... | -0.341708 | 0.123892 | 0.815909 | -0.072537 | 0.784217 | 0.403428 | 0.193747 | -0.043185 | -0.058719 | 60.00 |
5 rows × 30 columns
# Carve a validation set (20%) out of the training data, again stratified
# on the target to keep the fraud rate consistent.
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig,
    ser_ytrain_orig,
    test_size=0.2,
    random_state=SEED,
    stratify=ser_ytrain_orig)

ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(df_Xtrain.shape)
(182276, 30)
# random undersampling
# n = size of the minority class (value_counts sorts descending, so the
# last entry is the rarer class count).
n = df[target].value_counts().values[-1]

# sample n rows from each class -> balanced 50/50 dataframe
df_under = (df.groupby(target)
    .apply(lambda x: x.sample(n,random_state=SEED))
    .reset_index(drop=True))

# same stratified train/test/valid split scheme as for the full data
df_Xtrain_orig_under, df_Xtest_under, ser_ytrain_orig_under, ser_ytest_under = train_test_split(
    df_under.drop(target,axis=1),
    df_under[target],
    test_size=0.2,
    random_state=SEED,
    stratify=df_under[target])

df_Xtrain_under, df_Xvalid_under, ser_ytrain_under, ser_yvalid_under = train_test_split(
    df_Xtrain_orig_under,
    ser_ytrain_orig_under,
    test_size=0.2,
    random_state=SEED,
    stratify=ser_ytrain_orig_under)

# sanity check the class counts of the full-data splits
ser_ytrain.value_counts(), ser_ytest.value_counts(), ser_yvalid.value_counts()
(0 181961 1 315 Name: Class, dtype: int64, 0 56864 1 98 Name: Class, dtype: int64, 0 45490 1 79 Name: Class, dtype: int64)
https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
class CatBoostRegressor(
iterations=None, learning_rate=None,
depth=None, l2_leaf_reg=None,
model_size_reg=None, rsm=None,
loss_function='RMSE', border_count=None,
feature_border_type=None, per_float_feature_quantization=None,
input_borders=None, output_borders=None,
fold_permutation_block=None, od_pval=None,
od_wait=None, od_type=None,
nan_mode=None, counter_calc_method=None,
leaf_estimation_iterations=None, leaf_estimation_method=None,
thread_count=None, random_seed=None,
use_best_model=None, best_model_min_trees=None,
verbose=None, silent=None,
logging_level=None, metric_period=None,
ctr_leaf_count_limit=None, store_all_simple_ctr=None,
max_ctr_complexity=None, has_time=None,
allow_const_label=None, one_hot_max_size=None,
random_strength=None,name=None, ignored_features=None,
train_dir=None, custom_metric=None,
eval_metric=None, bagging_temperature=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, fold_len_multiplier=None,
used_ram_limit=None, gpu_ram_part=None,
pinned_memory_size=None, allow_writing_files=None,
final_ctr_computation_mode=None, approx_on_full_history=None,
boosting_type=None, simple_ctr=None,
combinations_ctr=None, per_feature_ctr=None,
ctr_target_border_count=None, task_type=None,
device_config=None, devices=None,
bootstrap_type=None, subsample=None,
sampling_unit=None, dev_score_calc_obj_block_size=None,
max_depth=None, n_estimators=None,
num_boost_round=None, num_trees=None,
colsample_bylevel=None, random_state=None,
reg_lambda=None, objective=None,
eta=None, max_bin=None,
gpu_cat_features_storage=None, data_partition=None,
metadata=None, early_stopping_rounds=None,
cat_features=None, grow_policy=None,
min_data_in_leaf=None, min_child_samples=None,
max_leaves=None, num_leaves=None,
score_function=None, leaf_estimation_backtracking=None,
ctr_history_unit=None, monotone_constraints=None
)
# Explore the public API of the catboost package (2-column table).
import catboost
show_method_attributes(catboost,2)
| 0 | 1 | |
|---|---|---|
| 0 | CatBoost | Pool |
| 1 | CatBoostClassifier | core |
| 2 | CatBoostError | cv |
| 3 | CatBoostRegressor | sum_models |
| 4 | CatboostError | to_classifier |
| 5 | EFstrType | to_regressor |
| 6 | FeaturesData | train |
| 7 | MetricVisualizer | version |
| 8 | MultiRegressionCustomMetric | widget |
| 9 | MultiRegressionCustomObjective |
# Explore the public API of the CatBoostClassifier class.
from catboost import CatBoostClassifier, Pool
show_method_attributes(CatBoostClassifier,2)
| 0 | 1 | |
|---|---|---|
| 0 | best_iteration_ | get_test_evals |
| 1 | best_score_ | get_text_feature_indices |
| 2 | calc_feature_statistics | get_tree_leaf_counts |
| 3 | calc_leaf_indexes | grid_search |
| 4 | classes_ | is_fitted |
| 5 | compare | iterate_leaf_indexes |
| 6 | copy | learning_rate_ |
| 7 | create_metric_calcer | load_model |
| 8 | drop_unused_features | plot_partial_dependence |
| 9 | eval_metrics | plot_predictions |
| 10 | evals_result_ | plot_tree |
| 11 | feature_importances_ | predict |
| 12 | feature_names_ | predict_log_proba |
| 13 | fit | predict_proba |
| 14 | get_all_params | random_seed_ |
| 15 | get_best_iteration | randomized_search |
| 16 | get_best_score | save_borders |
| 17 | get_borders | save_model |
| 18 | get_cat_feature_indices | score |
| 19 | get_evals_result | set_feature_names |
| 20 | get_feature_importance | set_leaf_values |
| 21 | get_leaf_values | set_params |
| 22 | get_leaf_weights | set_scale_and_bias |
| 23 | get_metadata | shrink |
| 24 | get_object_importance | staged_predict |
| 25 | get_param | staged_predict_log_proba |
| 26 | get_params | staged_predict_proba |
| 27 | get_scale_and_bias | tree_count_ |
| 28 | get_test_eval |
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# time
time_start = time.time()

# current parameters (numpy arrays, so catboost loses column names)
desc = 'default,random_state=100, numpy'
Xtr = df_Xtrain.to_numpy()
ytr = ser_ytrain.to_numpy().ravel()
Xtx = df_Xtest.to_numpy()
ytx = ser_ytest.to_numpy().ravel()

# fit the model
model_cat = CatBoostClassifier(verbose=100, random_state=SEED)
model_cat.fit(Xtr, ytr)

# fitted model
model = model_cat

# save the model
# joblib.dump(model_cat, 'model_cat.pkl')
# model_cat = joblib.load('model_cat.pkl')

# predictions
# NOTE(review): cross_val_predict clones and refits the estimator on folds
# of the *test* data, so the fit on Xtr above does not influence these
# predictions; for a plain train/test evaluation use model_cat.predict(Xtx).
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=SEED)
ypreds_cv = cross_val_predict(model_cat, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# model evaluation
# FIX: use the `desc` variable defined above (it was previously unused and
# a diverging hard-coded string was written to the table instead).
row_eval = ['catboost', desc,
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average='micro'),
            recall_score(ytx, ypreds, average='micro'),
            f1_score(ytx, ypreds, average='micro'),
            roc_auc_score(ytx, ypreds),
            ]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken, 60)))
display(df_eval)
Learning rate set to 0.095119 0: learn: 0.4064239 total: 139ms remaining: 2m 18s 100: learn: 0.0014365 total: 5.84s remaining: 52s 200: learn: 0.0009553 total: 11.4s remaining: 45.2s 300: learn: 0.0006789 total: 16.9s remaining: 39.2s 400: learn: 0.0004701 total: 22.1s remaining: 33.1s 500: learn: 0.0003296 total: 27.1s remaining: 27s 600: learn: 0.0002371 total: 32.3s remaining: 21.4s 700: learn: 0.0001719 total: 39.5s remaining: 16.8s 800: learn: 0.0001354 total: 44.5s remaining: 11.1s 900: learn: 0.0001058 total: 49.9s remaining: 5.48s 999: learn: 0.0000872 total: 56.4s remaining: 0us Learning rate set to 0.043056 0: learn: 0.5635981 total: 22.7ms remaining: 22.7s 100: learn: 0.0017777 total: 1.93s remaining: 17.2s 200: learn: 0.0007480 total: 3.76s remaining: 14.9s 300: learn: 0.0003691 total: 5.59s remaining: 13s 400: learn: 0.0002480 total: 7.23s remaining: 10.8s 500: learn: 0.0001734 total: 9.1s remaining: 9.06s 600: learn: 0.0001340 total: 10.9s remaining: 7.23s 700: learn: 0.0001134 total: 12.7s remaining: 5.42s 800: learn: 0.0000977 total: 14.5s remaining: 3.59s 900: learn: 0.0000858 total: 16.3s remaining: 1.79s 999: learn: 0.0000764 total: 18.2s remaining: 0us Learning rate set to 0.043056 0: learn: 0.5628806 total: 30ms remaining: 30s 100: learn: 0.0019698 total: 1.8s remaining: 16s 200: learn: 0.0009914 total: 3.61s remaining: 14.4s 300: learn: 0.0005191 total: 5.43s remaining: 12.6s 400: learn: 0.0003353 total: 7.51s remaining: 11.2s 500: learn: 0.0002580 total: 9.29s remaining: 9.25s 600: learn: 0.0002056 total: 11s remaining: 7.3s 700: learn: 0.0001702 total: 13.2s remaining: 5.64s 800: learn: 0.0001445 total: 15s remaining: 3.73s 900: learn: 0.0001232 total: 16.7s remaining: 1.83s 999: learn: 0.0001062 total: 18.3s remaining: 0us Time taken: 1 min 34 secs
| Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
|---|---|---|---|---|---|---|---|
| 0 | catboost | default, seed=100 | 0.999456 | 0.999456 | 0.999456 | 0.999456 | 0.877489 |
# calculate the FPR and TPR for all thresholds of the classification
from sklearn import metrics

yprobs = model_cat.predict_proba(df_Xtest)
ypreds = yprobs[:,1]  # probability of the positive (fraud) class

fpr, tpr, threshold = metrics.roc_curve(ytest, ypreds)
roc_auc = metrics.auc(fpr, tpr)

# plot ROC curve with the chance diagonal for reference
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'red', label = 'ROC AUC score = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'b--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
import eli5
# eli5.explain_weights_catboost(model_cat) # same thing
# Features show up as positional indices because model_cat was fit on a
# numpy array (no column names).
eli5.show_weights(model_cat)
| Weight | Feature |
|---|---|
| 0.0776 | 4 |
| 0.0752 | 1 |
| 0.0671 | 14 |
| 0.0566 | 0 |
| 0.0495 | 8 |
| 0.0462 | 9 |
| 0.0451 | 26 |
| 0.0430 | 12 |
| 0.0385 | 2 |
| 0.0378 | 29 |
| 0.0346 | 10 |
| 0.0323 | 19 |
| 0.0318 | 24 |
| 0.0297 | 6 |
| 0.0281 | 11 |
| 0.0280 | 28 |
| 0.0278 | 13 |
| 0.0243 | 25 |
| 0.0237 | 15 |
| 0.0236 | 18 |
| … 10 more … | |
# Peek at the training features with their column names.
df_Xtrain.head(2)
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35574 | 38177.0 | 1.430419 | -0.718078 | 0.364706 | -0.744257 | -0.556090 | 0.698948 | -0.949852 | 0.131008 | -0.314353 | ... | 0.158424 | 0.042013 | 0.429576 | -0.301931 | -0.933773 | 0.840490 | -0.027776 | 0.044688 | -0.007522 | 0.2 |
| 46862 | 42959.0 | -2.425523 | -1.790293 | 2.522139 | 0.581141 | 0.918453 | 0.594426 | 0.224541 | 0.373885 | -0.168411 | ... | 0.984535 | 0.538438 | 0.877560 | 0.590595 | -0.293545 | 0.524022 | -0.328189 | -0.205285 | -0.109163 | 300.0 |
2 rows × 30 columns
# time
time_start = time.time()

# current parameters (keep dataframes so catboost retains feature names)
Xtr = df_Xtrain
ytr = ser_ytrain.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()

# fit the model with a validation eval_set (tracks the best iteration)
model = CatBoostClassifier(random_state=0,verbose=100)
model.fit(Xtr, ytr,
          eval_set=(Xvd, yvd))

# ypreds
# NOTE(review): cross_val_predict refits clones of the model on folds of
# the test data; the fit above is not used for these predictions.
skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)
ypreds = cross_val_predict(model, Xtx, ytx, cv=skf)

# roc auc of the cross-validated predictions
r = roc_auc_score(ytx, ypreds)

# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
print('ROC AUC Score ', r)
Learning rate set to 0.114633 0: learn: 0.3248715 test: 0.3251081 best: 0.3251081 (0) total: 61ms remaining: 1m 100: learn: 0.0013095 test: 0.0025200 best: 0.0025142 (81) total: 5.34s remaining: 47.5s 200: learn: 0.0008262 test: 0.0025112 best: 0.0025092 (198) total: 10.4s remaining: 41.2s 300: learn: 0.0005809 test: 0.0025419 best: 0.0025035 (248) total: 15.5s remaining: 36s 400: learn: 0.0003816 test: 0.0025728 best: 0.0025035 (248) total: 20.6s remaining: 30.7s 500: learn: 0.0002286 test: 0.0025920 best: 0.0025035 (248) total: 25.6s remaining: 25.5s 600: learn: 0.0001514 test: 0.0026787 best: 0.0025035 (248) total: 30.7s remaining: 20.4s 700: learn: 0.0001180 test: 0.0027240 best: 0.0025035 (248) total: 35.7s remaining: 15.2s 800: learn: 0.0000933 test: 0.0027987 best: 0.0025035 (248) total: 40.8s remaining: 10.1s 900: learn: 0.0000752 test: 0.0028333 best: 0.0025035 (248) total: 45.9s remaining: 5.05s 999: learn: 0.0000652 test: 0.0028814 best: 0.0025035 (248) total: 51.1s remaining: 0us bestTest = 0.002503484959 bestIteration = 248 Shrink model to first 249 iterations. 
Learning rate set to 0.052624 0: learn: 0.5183665 total: 27.9ms remaining: 27.9s 100: learn: 0.0016191 total: 2.16s remaining: 19.3s 200: learn: 0.0008757 total: 4.23s remaining: 16.8s 300: learn: 0.0005459 total: 6.29s remaining: 14.6s 400: learn: 0.0003611 total: 8.35s remaining: 12.5s 500: learn: 0.0002252 total: 10.4s remaining: 10.4s 600: learn: 0.0001643 total: 12.5s remaining: 8.27s 700: learn: 0.0001302 total: 14.5s remaining: 6.19s 800: learn: 0.0001119 total: 16.5s remaining: 4.11s 900: learn: 0.0000986 total: 18.5s remaining: 2.04s 999: learn: 0.0000873 total: 20.5s remaining: 0us Learning rate set to 0.052624 0: learn: 0.5191442 total: 34.8ms remaining: 34.7s 100: learn: 0.0016154 total: 2.14s remaining: 19.1s 200: learn: 0.0007919 total: 4.21s remaining: 16.7s 300: learn: 0.0005020 total: 6.24s remaining: 14.5s 400: learn: 0.0003378 total: 8.28s remaining: 12.4s 500: learn: 0.0002149 total: 10.3s remaining: 10.3s 600: learn: 0.0001663 total: 12.3s remaining: 8.2s 700: learn: 0.0001313 total: 14.4s remaining: 6.16s 800: learn: 0.0001091 total: 16.6s remaining: 4.12s 900: learn: 0.0000915 total: 18.6s remaining: 2.04s 999: learn: 0.0000787 total: 20.6s remaining: 0us Learning rate set to 0.052625 0: learn: 0.5185525 total: 34.5ms remaining: 34.5s 100: learn: 0.0017444 total: 2.11s remaining: 18.8s 200: learn: 0.0009289 total: 4.14s remaining: 16.5s 300: learn: 0.0005160 total: 6.18s remaining: 14.3s 400: learn: 0.0003131 total: 8.21s remaining: 12.3s 500: learn: 0.0002227 total: 10.3s remaining: 10.3s 600: learn: 0.0001674 total: 12.4s remaining: 8.26s 700: learn: 0.0001281 total: 14.5s remaining: 6.17s 800: learn: 0.0001046 total: 16.9s remaining: 4.2s 900: learn: 0.0000894 total: 19.1s remaining: 2.1s 999: learn: 0.0000776 total: 21.3s remaining: 0us Learning rate set to 0.052625 0: learn: 0.5199170 total: 27ms remaining: 26.9s 100: learn: 0.0017970 total: 2.52s remaining: 22.4s 200: learn: 0.0010334 total: 4.6s remaining: 18.3s 300: learn: 0.0006874 
total: 6.65s remaining: 15.4s 400: learn: 0.0003519 total: 8.74s remaining: 13.1s 500: learn: 0.0002494 total: 10.9s remaining: 10.9s 600: learn: 0.0001932 total: 13s remaining: 8.6s 700: learn: 0.0001543 total: 15s remaining: 6.4s 800: learn: 0.0001257 total: 17.1s remaining: 4.24s 900: learn: 0.0001065 total: 19.1s remaining: 2.1s 999: learn: 0.0000926 total: 21.2s remaining: 0us Learning rate set to 0.052625 0: learn: 0.5201697 total: 34.2ms remaining: 34.1s 100: learn: 0.0018228 total: 2.15s remaining: 19.1s 200: learn: 0.0009889 total: 4.19s remaining: 16.7s 300: learn: 0.0006222 total: 6.24s remaining: 14.5s 400: learn: 0.0003817 total: 8.28s remaining: 12.4s 500: learn: 0.0002598 total: 10.6s remaining: 10.6s 600: learn: 0.0001960 total: 12.7s remaining: 8.44s 700: learn: 0.0001505 total: 14.7s remaining: 6.29s 800: learn: 0.0001226 total: 16.8s remaining: 4.18s 900: learn: 0.0001071 total: 19s remaining: 2.08s 999: learn: 0.0000925 total: 21s remaining: 0us Time taken: 2 min 37 secs ROC AUC Score 0.8672765955003272
See the CatBoost tutorials: model analysis — feature statistics tutorial.
# float feature
feature_name = 'Amount'
# interactive per-feature statistics plot (catboost model-analysis tool)
dict_stats = model.calc_feature_statistics(df_Xtrain, ser_ytrain, feature_name, plot=True)

# feature importance table, most important first
df_imp = pd.DataFrame({'Feature': features,
                       'Importance': model.feature_importances_
                       })
df_imp.sort_values('Importance',ascending=False).style.background_gradient()
| Feature | Importance | |
|---|---|---|
| 4 | V4 | 9.231178 |
| 1 | V1 | 8.884553 |
| 12 | V12 | 7.858850 |
| 14 | V14 | 6.993834 |
| 8 | V8 | 5.448198 |
| 0 | Time | 5.067847 |
| 26 | V26 | 4.613358 |
| 11 | V11 | 3.664417 |
| 16 | V16 | 3.448854 |
| 6 | V6 | 3.389446 |
| 7 | V7 | 3.175937 |
| 29 | Amount | 3.115607 |
| 18 | V18 | 3.041911 |
| 10 | V10 | 2.904488 |
| 17 | V17 | 2.794143 |
| 25 | V25 | 2.587967 |
| 27 | V27 | 2.485769 |
| 19 | V19 | 2.457251 |
| 15 | V15 | 2.322782 |
| 2 | V2 | 2.310883 |
| 20 | V20 | 2.079404 |
| 13 | V13 | 1.815875 |
| 28 | V28 | 1.786981 |
| 3 | V3 | 1.770443 |
| 24 | V24 | 1.469989 |
| 22 | V22 | 1.362665 |
| 9 | V9 | 1.172512 |
| 5 | V5 | 1.104829 |
| 23 | V23 | 0.909606 |
| 21 | V21 | 0.730423 |
def plot_feature_imp_catboost(model_catboost, features):
    """Plot the feature importance horizontal bar plot.

    Parameters
    ----------
    model_catboost : fitted catboost model
        Source of the importance values (``feature_importances_``).
    features : list-like of str
        Feature names, in the training-column order of the model.
    """
    # BUG FIX: the original body read the *global* ``model`` and ignored
    # both arguments; use the parameters instead.
    df_imp = pd.DataFrame({'Feature': features,
                           'Importance': model_catboost.feature_importances_
                           })
    df_imp = df_imp.sort_values('Importance').set_index('Feature')

    ax = df_imp.plot.barh(figsize=(12,8))
    plt.grid(True)
    plt.title('Feature Importance',fontsize=14)
    ax.get_legend().remove()

    # annotate each bar with its importance value
    for p in ax.patches:
        x = p.get_width()
        y = p.get_y()
        text = '{:.2f}'.format(p.get_width())
        ax.text(x, y,text,fontsize=15,color='indigo')
    plt.show()

plot_feature_imp_catboost(model, features)
# Built-in importance table; prettified=True returns a dataframe
# ('Feature Id', 'Importances') sorted by importance.
df_fimp = model.get_feature_importance(prettified=True)
df_fimp.head()
| Feature Id | Importances | |
|---|---|---|
| 0 | V4 | 9.231178 |
| 1 | V1 | 8.884553 |
| 2 | V12 | 7.858850 |
| 3 | V14 | 6.993834 |
| 4 | V8 | 5.448198 |
# Horizontal barplot of the prettified importance table, with the numeric
# value written next to each bar.
plt.figure(figsize=(12,8))
ax = sns.barplot(x=df_fimp.columns[1], y=df_fimp.columns[0], data=df_fimp);
for p in ax.patches:
    x = p.get_width()
    y = p.get_y()
    text = '{:.2f}'.format(p.get_width())
    ax.text(x, y,text,fontsize=15,color='indigo',va='top',ha='left')
from catboost import CatBoost, Pool
# help(CatBoost)

cat_features = []  # take it empty for the moment
dtrain = Pool(df_Xtrain, ser_ytrain, cat_features=cat_features)
dvalid = Pool(df_Xvalid, ser_yvalid, cat_features=cat_features)
dtest = Pool(df_Xtest, ser_ytest, cat_features=cat_features)

# BUG FIX: the original dict listed 'verbose' twice (False, then 200);
# Python silently keeps only the last one, so the False entry was dead.
# Keep the effective value (200) once.
params_cat = {'iterations': 100,
              'random_seed': 0,
              'eval_metric':'AUC',
              'loss_function':'Logloss',
              'cat_features': [],
              'ignored_features': [],
              'early_stopping_rounds': 200,
              'verbose': 200,
              }

bst_cat = CatBoost(params=params_cat)
bst_cat.fit(dtrain,
            eval_set=(df_Xvalid, ser_yvalid),
            use_best_model=True,
            plot=True);
# AUC of the best model on the held-out test pool (last element = final)
print(bst_cat.eval_metrics(dtest, ['AUC'])['AUC'][-1])
Learning rate set to 0.312111 0: test: 0.9426860 best: 0.9426860 (0) total: 60.1ms remaining: 5.95s 99: test: 0.9732950 best: 0.9804994 (14) total: 6s remaining: 0us bestTest = 0.9804994003 bestIteration = 14 Shrink model to first 15 iterations. 0.9632516501958127
cv(pool=None, params=None, dtrain=None, iterations=None,
num_boost_round=None, fold_count=None, nfold=None, inverted=False,
partition_random_seed=0, seed=None, shuffle=True, logging_level=None,
stratified=None, as_pandas=True, metric_period=None, verbose=None,
verbose_eval=None, plot=False, early_stopping_rounds=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, folds=None, type='Classical')
params = {'iterations': 100, 'verbose': False,
          'random_seed': 0,
          'loss_function':'Logloss',
          'eval_metric':'AUC',
          }

# 2-fold stratified CV on the training pool.
# BUG FIX: plot expects a boolean; the string "True" only worked because
# any non-empty string is truthy.
df_scores = catboost.cv(dtrain,
                        params,
                        fold_count=2,
                        verbose=100,
                        shuffle=True,
                        stratified=True,
                        plot=True) # plot does not work in google colab
0: test: 0.9182109 best: 0.9182109 (0) total: 252ms remaining: 25s 99: test: 0.9769374 best: 0.9792743 (56) total: 9.15s remaining: 0us
# cv returns per-iteration mean/std of each metric for train and test folds
print(df_scores.columns)
df_scores.head()
Index(['iterations', 'test-AUC-mean', 'test-AUC-std', 'test-Logloss-mean',
'test-Logloss-std', 'train-Logloss-mean', 'train-Logloss-std'],
dtype='object')
| iterations | test-AUC-mean | test-AUC-std | test-Logloss-mean | test-Logloss-std | train-Logloss-mean | train-Logloss-std | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.918211 | 0.015632 | 0.585840 | 0.001246 | 0.585823 | 0.001171 |
| 1 | 1 | 0.922383 | 0.027860 | 0.500689 | 0.002353 | 0.500659 | 0.002239 |
| 2 | 2 | 0.933871 | 0.022411 | 0.425035 | 0.003157 | 0.425024 | 0.003205 |
| 3 | 3 | 0.928061 | 0.020897 | 0.365778 | 0.003360 | 0.365737 | 0.003457 |
| 4 | 4 | 0.939572 | 0.017085 | 0.310018 | 0.004005 | 0.309959 | 0.003970 |
# Train vs test log-loss across boosting iterations.
# BUG FIX: the original reused a leftover `ax` from an earlier cell
# (the importance barplot), drawing onto the wrong figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x='iterations',y='train-Logloss-mean',data=df_scores,ax=ax,color='r')
sns.lineplot(x='iterations',y='test-Logloss-mean',data=df_scores,ax=ax,color='b',alpha=0.2,linewidth=5,linestyle='--')
plt.show()
We generally should optimize model complexity and then tune the convergence.
Model complexity: max_depth, etc.; convergence: learning_rate.
Parameters:
# Baseline: default CatBoost on the full (imbalanced) training data.
model = CatBoostClassifier(verbose=100,random_state=SEED)
model.fit(df_Xtrain, ytr)  # ytr holds ser_ytrain values from an earlier cell
ypreds = model.predict(df_Xtest)
cm = confusion_matrix(ytest, ypreds)
print(cm)
Learning rate set to 0.095119 0: learn: 0.4064239 total: 57ms remaining: 56.9s 100: learn: 0.0014365 total: 5.09s remaining: 45.3s 200: learn: 0.0009553 total: 10.9s remaining: 43.5s 300: learn: 0.0006789 total: 16.8s remaining: 39.1s 400: learn: 0.0004701 total: 23.8s remaining: 35.5s 500: learn: 0.0003296 total: 29.7s remaining: 29.6s 600: learn: 0.0002371 total: 35.6s remaining: 23.7s 700: learn: 0.0001719 total: 40.7s remaining: 17.4s 800: learn: 0.0001354 total: 46.7s remaining: 11.6s 900: learn: 0.0001058 total: 52.8s remaining: 5.8s 999: learn: 0.0000872 total: 1m 1s remaining: 0us [[56861 3] [ 22 76]]
# Larger iteration budget with early stopping on the validation set.
params = dict(verbose=500,
              random_state=0,
              iterations=3_000,
              eval_metric='AUC',
              cat_features = [],
              early_stopping_rounds=200,
              )

model = catboost.CatBoostClassifier(**params)
model.fit(df_Xtrain, ytrain,
          eval_set=(df_Xvalid, yvalid),
          use_best_model=True,  # keep the iteration with the best eval AUC
          plot=False
          );
Learning rate set to 0.071082 0: test: 0.9426860 best: 0.9426860 (0) total: 463ms remaining: 23m 8s 500: test: 0.9789769 best: 0.9789769 (500) total: 32s remaining: 2m 39s 1000: test: 0.9804130 best: 0.9814751 (885) total: 51.9s remaining: 1m 43s Stopped by overfitting detector (200 iterations wait) bestTest = 0.9814751329 bestIteration = 885 Shrink model to first 886 iterations.
time_start = time.time()

# Quick check: does a much smaller budget (50 trees) suffice?
model = CatBoostClassifier(verbose=False,random_state=0,iterations=50)
model.fit(df_Xtrain, ser_ytrain)

ypreds = model.predict(df_Xtest)
cm = confusion_matrix(ytest, ypreds)
# total misclassifications = false positives + false negatives
error = cm[0,1] + cm[1,0]

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
# FIX: corrected the "Errro" typo in the printed message.
print('Error confusion matrix', error)
# using 50 iterations is worse, use previous 1000.
Time taken: 0 min 2 secs Errro confusion matrix 26
for n in [6]: # default depth = 6
    # depth-sweep scaffold (only the default value is tried here)
    model = CatBoostClassifier(verbose=False,random_state=0,
                               iterations=1_000,
                               depth=n,
                               )
    model.fit(Xtr, ytr)
    ypreds = model.predict(Xtx)
    cm = confusion_matrix(ytest, ypreds)
    error = cm[0,1] + cm[1,0]  # off-diagonal cells = misclassified count
    print(f'Confusion matrix error count = {error} for n = {n}')
Confusion matrix error count = 26 for n = 6
for n in [0]:
    # random-seed sensitivity scaffold (single seed tried here)
    model = CatBoostClassifier(verbose=False,random_state=n,
                               depth=6,
                               iterations=1_000,
                               )
    model.fit(Xtr, ytr)
    ypreds = model.predict(Xtx)
    cm = confusion_matrix(ytest, ypreds)
    error = cm[0,1] + cm[1,0]  # off-diagonal cells = misclassified count
    print(f'Confusion matrix error count = {error} for n = {n}')
Confusion matrix error count = 26 for n = 0
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress
def objective(trial):
    """Optuna objective: fit a CatBoost classifier with sampled
    hyperparameters and return the validation ROC AUC (maximized).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyperparameter values.

    Returns
    -------
    float
        ROC AUC on the validation set.
    """
    # NOTE: suggest_float replaces the deprecated suggest_uniform
    # (removed in optuna 4.x); value distributions are identical.
    params_cat_optuna = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type',
                                                    ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit': '3gb',
    }
    # conditional parameters: only valid for the sampled bootstrap type
    if params_cat_optuna['bootstrap_type'] == 'Bayesian':
        params_cat_optuna['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif params_cat_optuna['bootstrap_type'] == 'Bernoulli':
        params_cat_optuna['subsample'] = trial.suggest_float('subsample', 0.1, 1)
    # fit the model with early stopping on the validation set
    model = CatBoostClassifier(random_state=SEED, **params_cat_optuna)
    model.fit(df_Xtrain, ser_ytrain,
              eval_set=[(df_Xvalid, ser_yvalid)],
              verbose=0,
              early_stopping_rounds=100)
    # BUG FIX: ROC AUC must be computed from positive-class probabilities,
    # not hard 0/1 labels. predict() already returns labels, so the old
    # np.rint(ypreds) was a no-op and the AUC was degenerate.
    yprobs = model.predict_proba(df_Xvalid)[:, 1]
    score = roc_auc_score(ser_yvalid.to_numpy().ravel(), yprobs)
    return score
# NOTE: optuna's TPE sampling is inherently stochastic; two runs may pick
# different hyperparameters even with a seeded sampler.
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1  # make it large
# persist trials to sqlite so the search can be resumed later
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    study_name='cat_optuna',
    storage='sqlite:///cat_optuna_fraud_detection.db',
    load_if_exists=True,
)
study.optimize(objective, n_trials=N_TRIALS, timeout=600)
# Resume the saved study from sqlite and run additional trials.
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1  # make it large
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    study_name='cat_optuna',
    storage='sqlite:///cat_optuna_fraud_detection.db',
    load_if_exists=True,  # picks up previously-finished trials
)
study.optimize(objective, n_trials=N_TRIALS)
# Summarize the search: trial count and the best hyperparameters found.
print(f'Number of finished trials: {len(study.trials)}')
best_trial = study.best_trial        # trial with the highest validation AUC
params_best = best_trial.params      # its hyperparameter dict
params_best
Number of finished trials: 2
{'bagging_temperature': 1.4860484007536512,
'boosting_type': 'Plain',
'bootstrap_type': 'Bayesian',
'colsample_bylevel': 0.07040400702545975,
'depth': 8,
'objective': 'Logloss'}
# Fit the Optuna-tuned CatBoost model, persist it, and append its test-set
# metrics to the running df_eval comparison table.
time_start = time.time()

model_name = 'catboost'
desc = 'grid search optuna'

Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()

# build the classifier with the best Optuna parameters
# (fixed: the assignment was duplicated as `clf_lgb = clf_lgb = ...`)
params_best = study.best_trial.params
clf_lgb = CatBoostClassifier(random_state=SEED, verbose=False)
clf_lgb.set_params(**params_best)

# fit and persist the model
clf_lgb.fit(Xtr, ytr)
joblib.dump(clf_lgb, '../outputs/clf_cat_grid_search_optuna.pkl')

# reload to confirm the saved artifact round-trips
clf_lgb = joblib.load('../outputs/clf_cat_grid_search_optuna.pkl')

# predictions
# NOTE(review): cross_val_predict refits the estimator on folds of the TEST
# data, so the model trained above is not what gets evaluated here.
# Consider `ypreds = clf_lgb.predict(Xtx)` instead — confirm intent.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=SEED)
ypreds_cv = cross_val_predict(clf_lgb, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# model evaluation (binary averaging: fraud is the positive class)
average = 'binary'
row_eval = [model_name, desc,
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            roc_auc_score(ytx, ypreds),
            ]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken, 60)))
display(df_eval)
Time taken: 0 min 52 secs
| Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
|---|---|---|---|---|---|---|---|
| 0 | catboost | default, seed=100 | 0.999456 | 0.999456 | 0.999456 | 0.999456 | 0.877489 |
| 1 | catboost | grid search optuna | 0.999456 | 0.958904 | 0.714286 | 0.818713 | 0.857116 |
df_eval.sort_values('Recall',ascending=False).style.background_gradient(subset='Recall')
| Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
|---|---|---|---|---|---|---|---|
| 0 | catboost | default, seed=100 | 0.999456 | 0.999456 | 0.999456 | 0.999456 | 0.877489 |
| 1 | catboost | grid search optuna | 0.999456 | 0.958904 | 0.714286 | 0.818713 | 0.857116 |
# Confusion matrix for the latest predictions; flattened order is
# (tn, fp, fn, tp) for a binary problem.
cm = confusion_matrix(ytest, ypreds)
vals = cm.ravel()
cm
array([[56861, 3],
[ 28, 70]])
# Summarize fraud-detection performance from the flattened confusion
# matrix: vals = (tn, fp, fn, tp), so vals[2] is missed frauds (fn)
# and vals[2] + vals[3] is the total number of actual frauds.
print('Catboost Grid Search Results')
print('-' * 25)
total_frauds = vals[2] + vals[3]
print('Total Frauds: ', total_frauds)
print('Incorrect Frauds: ', vals[2])
print('Incorrect Percent: ', round(vals[2] * 100 / total_frauds, 2), '%')
Catboost Grid Search Results ------------------------- Total Frauds: 98 Incorrect Frauds: 28 Incorrect Percent: 28.57 %
from bhishan.bp import plotly_binary_clf_evaluation

# BUG FIX: predict_proba column 0 is P(class=0) = not-fraud; a binary-clf
# evaluation plot needs the positive-class (fraud) probability, column 1.
# (The earlier output showed probabilities ~0.9999, i.e. the wrong column.)
yprobs = model.predict_proba(df_Xtest)
yprobs = yprobs[:, 1]  # positive-class (fraud) probability
plotly_binary_clf_evaluation('clf_lgb_optuna', model, ytx, ypreds, yprobs, df)
yprobs
array([0.9998514 , 0.99999797, 0.99999704, ..., 0.9999982 , 0.9999997 ,
0.9999975 ])
%%time
model = CatBoostClassifier(verbose=False,random_state=100,
depth=6,
iterations=1_000,
)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
cm = confusion_matrix(ytest, ypreds)
error = cm[0,1] + cm[1,0]
print(f'Confusion matrix error count = {error} for n = {n}')
Confusion matrix error count = 21 for n = 0 CPU times: user 2min 20s, sys: 4.96 s, total: 2min 25s Wall time: 45.4 s
print(cm)
[[56864 0] [ 21 77]]
df_Xtrain.head(2).append(df_Xtest.head(2))
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35574 | 38177.0 | 1.430419 | -0.718078 | 0.364706 | -0.744257 | -0.556090 | 0.698948 | -0.949852 | 0.131008 | -0.314353 | ... | 0.158424 | 0.042013 | 0.429576 | -0.301931 | -0.933773 | 0.840490 | -0.027776 | 0.044688 | -0.007522 | 0.20 |
| 46862 | 42959.0 | -2.425523 | -1.790293 | 2.522139 | 0.581141 | 0.918453 | 0.594426 | 0.224541 | 0.373885 | -0.168411 | ... | 0.984535 | 0.538438 | 0.877560 | 0.590595 | -0.293545 | 0.524022 | -0.328189 | -0.205285 | -0.109163 | 300.00 |
| 248750 | 154078.0 | 0.046622 | 1.529678 | -0.453615 | 1.282569 | 1.110333 | -0.882716 | 1.046420 | -0.117121 | -0.679897 | ... | 0.240559 | -0.338472 | -0.839547 | 0.066527 | 0.836447 | 0.076790 | -0.775158 | 0.261012 | 0.058359 | 18.70 |
| 161573 | 114332.0 | 0.145870 | 0.107484 | 0.755127 | -0.995936 | 1.159107 | 2.113961 | 0.036200 | 0.471777 | 0.627622 | ... | -0.107332 | 0.297644 | 1.285809 | -0.140560 | -0.910706 | -0.449729 | -0.235203 | -0.036910 | -0.227111 | 9.99 |
4 rows × 30 columns
# Global feature importances of the fitted CatBoost model via eli5.
import eli5
eli5.show_weights(model)
| Weight | Feature |
|---|---|
| 0.1009 | V1 |
| 0.0653 | V4 |
| 0.0641 | V14 |
| 0.0604 | V26 |
| 0.0542 | Amount |
| 0.0389 | V12 |
| 0.0371 | V15 |
| 0.0369 | V10 |
| 0.0354 | V11 |
| 0.0333 | Time |
| 0.0298 | V8 |
| 0.0297 | V19 |
| 0.0281 | V13 |
| 0.0274 | V7 |
| 0.0273 | V20 |
| 0.0267 | V2 |
| 0.0255 | V3 |
| 0.0254 | V22 |
| 0.0253 | V16 |
| 0.0247 | V18 |
| … 10 more … | |
# Permutation importance: measures the score drop when each feature is
# shuffled on the test set — a model-agnostic check on the gain-based
# importances shown above.
from eli5.sklearn import PermutationImportance
feature_names = df_Xtrain.columns.tolist()
perm = PermutationImportance(model).fit(df_Xtest, ytx)
eli5.show_weights(perm, feature_names=feature_names)
| Weight | Feature |
|---|---|
| 0.0008 ± 0.0001 | V14 |
| 0.0003 ± 0.0000 | V4 |
| 0.0002 ± 0.0000 | V10 |
| 0.0001 ± 0.0000 | Amount |
| 0.0001 ± 0.0001 | V26 |
| 0.0001 ± 0.0000 | V28 |
| 0.0001 ± 0.0000 | V17 |
| 0.0001 ± 0.0000 | V12 |
| 0.0001 ± 0.0000 | V1 |
| 0.0001 ± 0.0000 | V16 |
| 0.0000 ± 0.0000 | V19 |
| 0.0000 ± 0.0000 | V22 |
| 0.0000 ± 0.0000 | V20 |
| 0.0000 ± 0.0000 | V27 |
| 0.0000 ± 0.0000 | V8 |
| 0.0000 ± 0.0000 | V3 |
| 0.0000 ± 0.0000 | V6 |
| 0.0000 ± 0.0000 | V25 |
| 0.0000 ± 0.0000 | V11 |
| 0.0000 ± 0.0000 | V18 |
| … 10 more … | |
import lime
import lime.lime_tabular

# Pick one test row and compare its true label with the model prediction.
idx = 0
example = df_Xtest.iloc[idx]
answer = ser_ytest.iloc[idx]
feature_names = df_Xtest.columns.tolist()

# reshape the 1-D series into a single-row 2-D array for predict
prediction = model.predict(example.to_numpy().reshape(1, -1))

print(f'answer = {answer}')
print('prediction = ', prediction[0])
print()
print(example)
print(feature_names)
answer = 0 prediction = 0 Time 154078.000000 V1 0.046622 V2 1.529678 V3 -0.453615 V4 1.282569 V5 1.110333 V6 -0.882716 V7 1.046420 V8 -0.117121 V9 -0.679897 V10 -0.923709 V11 0.371519 V12 -0.000047 V13 0.512255 V14 -2.091762 V15 0.786796 V16 0.159652 V17 1.706939 V18 0.458922 V19 0.037665 V20 0.240559 V21 -0.338472 V22 -0.839547 V23 0.066527 V24 0.836447 V25 0.076790 V26 -0.775158 V27 0.261012 V28 0.058359 Amount 18.700000 Name: 248750, dtype: float64 ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
import lime
import lime.lime_tabular

# No categorical features here: every column is numeric (PCA components
# plus Time and Amount).
categorical_features = []
categorical_features_idx = [df_Xtrain.columns.get_loc(col)
                            for col in categorical_features]

explainer = lime.lime_tabular.LimeTabularExplainer(
    df_Xtrain.to_numpy(),
    feature_names=feature_names,
    class_names=['Not-fraud', 'Fraud'],
    categorical_features=categorical_features_idx,
    mode='classification',
)

# Local explanation for the chosen row: top 8 features driving the prediction.
exp = explainer.explain_instance(example, model.predict_proba, num_features=8)
exp.show_in_notebook(show_table=True)
exp.as_pyplot_figure();  # semicolon suppresses the duplicate figure echo
import shap

shap.initjs()

# Refit CatBoost (logging progress every 100 iterations), then compute SHAP
# values for the whole test set with a tree explainer.
model = CatBoostClassifier(verbose=100, random_state=100)
model.fit(df_Xtrain, ytrain)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
Learning rate set to 0.095119 0: learn: 0.4064239 total: 77.9ms remaining: 1m 17s 100: learn: 0.0014365 total: 3.63s remaining: 32.3s 200: learn: 0.0009553 total: 7.06s remaining: 28.1s 300: learn: 0.0006789 total: 10.5s remaining: 24.4s 400: learn: 0.0004701 total: 13.8s remaining: 20.7s 500: learn: 0.0003296 total: 17.5s remaining: 17.5s 600: learn: 0.0002371 total: 21.1s remaining: 14s 700: learn: 0.0001719 total: 25.4s remaining: 10.8s 800: learn: 0.0001354 total: 28.9s remaining: 7.17s 900: learn: 0.0001058 total: 32.2s remaining: 3.54s 999: learn: 0.0000872 total: 36s remaining: 0us
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
df_Xtest.head(1)
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 248750 | 154078.0 | 0.046622 | 1.529678 | -0.453615 | 1.282569 | 1.110333 | -0.882716 | 1.04642 | -0.117121 | -0.679897 | ... | 0.240559 | -0.338472 | -0.839547 | 0.066527 | 0.836447 | 0.07679 | -0.775158 | 0.261012 | 0.058359 | 18.7 |
1 rows × 30 columns
df_Xtest.head(1)['V15 V18 V3 V24 V1 V8 V4 V14 V2 V6 V9 V20'.split()].round(4)
| V15 | V18 | V3 | V24 | V1 | V8 | V4 | V14 | V2 | V6 | V9 | V20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 248750 | 0.7868 | 0.4589 | -0.4536 | 0.8364 | 0.0466 | -0.1171 | 1.2826 | -2.0918 | 1.5297 | -0.8827 | -0.6799 | 0.2406 |
# Force plot for the first test row only.
# Pass matplotlib=True to render without Javascript.
idx = 0
shap.force_plot(explainer.expected_value,
shap_values[idx,:],
df_Xtest.iloc[idx,:],
matplotlib=False,
text_rotation=90)
# In the force plot, red features push the raw score higher and blue
# features push it lower, relative to the expected (base) value.
NUM = 100
shap.force_plot(explainer.expected_value, shap_values[:NUM,:],
df_Xtest.iloc[:NUM,:],matplotlib=False)
# Beeswarm summary of SHAP values across all test rows.
shap.summary_plot(shap_values, df_Xtest)
# Mean |SHAP| per feature, as a bar chart.
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
# SHAP value of 'Amount' vs its raw value (interaction feature auto-chosen).
shap.dependence_plot("Amount", shap_values, df_Xtest)
# 'Time' dependence, explicitly colored by 'Amount'.
shap.dependence_plot(ind='Time', interaction_index='Amount',
shap_values=shap_values,
features=df_Xtest,
display_features=df_Xtest)
# Report total notebook runtime as hr / min / sec.
notebook_end_time = time.time()
time_taken = notebook_end_time - notebook_start_time  # reuse the recorded end time
h, rem = divmod(time_taken, 60 * 60)  # hours, then leftover seconds
# fixed typo in the message: 'noteook' -> 'notebook'
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(rem, 60)))
Time taken to run whole noteook: 0 hr 18 min 59 secs